1 Importing data

# Set the directory where your CSV files are located
data_folder <- "data/mocap_data_prepped"

# List all CSV files in the specified folder
csv_files <- list.files(data_folder, pattern = "*.csv", full.names = TRUE)

# Initialize an empty list to store the dataframes
list_of_dataframes <- list()

# Loop through each CSV file, read it into a dataframe, and store it in the list with its original file name
for (csv_file in csv_files) {
  # Extract the file name without the extension
  file_name <- tools::file_path_sans_ext(basename(csv_file))
  
  # Read the CSV file into a dataframe and assign it to the list with the file name as the list name
  list_of_dataframes[[file_name]] <- read.csv(csv_file)
}

2 Preprocessing

In our preprocessing we want to cover the following:

  1. Dataframe only contains numeric values
  2. NA handling
  3. Normalising (z-score/ standardisation)
  4. Plot covariance/correlation matrix

2.1 Converting to wideformat: only numeric values

We only want numeric values in our set. Therefore we convert to a wide format where each combination of marker and subject has their own xyz coordinates. We write a function to do this for us.

convert_to_wide_format <- function(df) {
  df %>%
    unite("marker_subject", marker, subject, remove = TRUE) %>%
    pivot_wider(
      names_from = marker_subject,
      values_from = c(x, y, z)
    )
}

Then we apply this function to each dataframe in the list_of_dataframes

list_of_dataframes <- lapply(list_of_dataframes, convert_to_wide_format)

Now we effectively have dataframes where each rows represents their own time-point with a row every .00300 seconds. We proceed with removing the columns that do not represent an x,y and z.

for (i in seq_along(list_of_dataframes)) {
  list_of_dataframes[[i]] <- list_of_dataframes[[i]] %>%
    select(-index, -condition, -group, -elapsed_time)
}
rm(i)

2.2 NA-Handling

For each time we see an NA, we add the last known value for each column. This method is called “last observation carries forward”. If there are no last known values, we fill the NAs with the next known value (next observation carries backwards).We start by making a function:

library(tidyr)

fill_NAs_with_LOCF_NOCB <- function(df) {
  # Apply Last Observation Carried Forward (LOCF)
  df_filled_locf <- apply(df, 2, function(x) {
    if (all(is.na(x))) {
      return(x)
    } else {
      return(na.locf(x, na.rm = FALSE))
    }
  })

  # Convert back to dataframe
  df_filled_locf <- as.data.frame(df_filled_locf)

  # Apply Next Observation Carried Backward (NOCB)
  df_filled_locf_nocb <- apply(df_filled_locf, 2, function(x) {
    if (all(is.na(x))) {
      return(x)
    } else {
      return(na.locf(x, fromLast = TRUE, na.rm = FALSE))
    }
  })

  # Convert back to dataframe and return
  return(as.data.frame(df_filled_locf_nocb))
}
list_of_dataframes <- lapply(list_of_dataframes, fill_NAs_with_LOCF_NOCB)

#quick na check
sapply(list_of_dataframes, function(df) any(is.na(df)))
##  group0_jointlead group0_leadfollow  group1_jointlead group1_leadfollow 
##             FALSE             FALSE             FALSE             FALSE 
## group12_jointlead  group2_jointlead  group3_jointlead group3_leadfollow 
##             FALSE             FALSE             FALSE             FALSE 
## group4_leadfollow  group6_jointlead group6_leadfollow group8_leadfollow 
##             FALSE             FALSE             FALSE             FALSE

There are no longer any NAs, and we proceed with standardization.

2.3 Standardisation and normalisation

Our subjects were facing eachother and mirroring eachother. We want to effectively normalise their points to simulate them being the same height (having the same skeleton) and standing in the same spot, facing the same direction.

standardize_and_normalize_dataframe <- function(df) {
  # Standardize each column of the dataframe
  standardized_df <- scale(df)
  
  # Normalize the standardized dataframe using min-max scaling
  min_max_normalized_df <- as.data.frame(apply(standardized_df, 2, function(x) {
    (x - min(x)) / (max(x) - min(x))
  }))
  
  return(min_max_normalized_df)
}

list_of_dataframes <- lapply(list_of_dataframes, standardize_and_normalize_dataframe)

2.4 Splitting the data:

list_of_dataframes_A = list()
list_of_dataframes_B = list()

for (df_name in names(list_of_dataframes)) {
    df = list_of_dataframes[[df_name]]

    # Columns ending with _A
    cols_A = grep("_A$", names(df), value = TRUE)
    if (length(cols_A) > 0) {
        list_of_dataframes_A[[paste0(df_name)]] = df[, cols_A]
    }

    # Columns ending with _B
    cols_B = grep("_B$", names(df), value = TRUE)
    if (length(cols_B) > 0) {
        list_of_dataframes_B[[paste0(df_name)]] = df[, cols_B]
    }
}

2.5 Covariance/correlation matrix

library(ggplot2)
library(tidyr)

# Function to create a correlation matrix plot using ggplot2
create_corr_plot <- function(corr_matrix, df_name, subject) {
    # Transforming the correlation matrix into a long format
    corr_data <- as.data.frame(as.table(corr_matrix))
    names(corr_data) <- c("Var1", "Var2", "Correlation")
    
    # Create the plot
    ggplot(corr_data, aes(Var1, Var2, fill = Correlation)) +
        geom_tile() +
        scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0) +
        theme_ipsum() +  # Apply the theme_ipsum
        labs(
            title = "Correlation Matrix",
            subtitle = paste(df_name, subject)
        ) +
        theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
        coord_fixed()  # Ensure a 1:1 aspect ratio
}

# Looping through list_of_dataframes_A
for (i in seq_along(list_of_dataframes_A)) {
    df_name_A <- names(list_of_dataframes_A)[i]
    corr_matrix_A <- cor(list_of_dataframes_A[[i]])
    plot_A <- create_corr_plot(corr_matrix_A, df_name_A, "Subject A")
    print(plot_A)
}

# Looping through list_of_dataframes_B
for (i in seq_along(list_of_dataframes_B)) {
    df_name_B <- names(list_of_dataframes_B)[i]
    corr_matrix_B <- cor(list_of_dataframes_B[[i]])
    plot_B <- create_corr_plot(corr_matrix_B, df_name_B, "Subject B")
    print(plot_B)
}

3 Running PCA

# Load the necessary library for PCA
library(FactoMineR)

# Create empty lists to store PCA results and principal components
pca_results_list_A <- list()
pca_results_list_B <- list()
pca_components_list_A <- list()
pca_components_list_B <- list()

# Define the number of principal components to use
num_pca_components_to_use <- 2  # You can adjust this as needed

for (i in 1:16) {
    if (i <= length(list_of_dataframes_A)) {
        # Extract the dataframe and its name
        df_A <- list_of_dataframes_A[[i]]
        df_name_A <- names(list_of_dataframes_A)[i]
        
        # Perform PCA
        pca_A <- PCA(df_A, graph = FALSE)
        
        # Save PCA result in the list with dataframe name as key
        pca_results_list_A[[df_name_A]] <- pca_A
        
        # Extract and store principal components with dataframe name as key
pca_components_A <- pca_A$ind$coord[, 1:num_pca_components_to_use, drop = FALSE]
pca_components_list_A[[df_name_A]] <- pca_components_A
    }

    if (i <= length(list_of_dataframes_B)) {
        # Extract the dataframe and its name
        df_B <- list_of_dataframes_B[[i]]
        df_name_B <- names(list_of_dataframes_B)[i]
        
        # Perform PCA
        pca_B <- PCA(df_B, graph = FALSE)
        
        # Save PCA result in the list with dataframe name as key
        pca_results_list_B[[df_name_B]] <- pca_B
        
        # Extract and store principal components with dataframe name as key
        pca_components_B <- pca_B$ind$coord[, 1:num_pca_components_to_use, drop = FALSE]
pca_components_list_B[[df_name_B]] <- pca_components_B
    }
}

# Now you have two sets of lists:
# - pca_results_list_A and pca_results_list_B containing PCA results
# - pca_components_list_A and pca_components_list_B containing principal components
# with dataframe names as keys

rm(corr_matrix_A, corr_matrix_B, df, df_A, df_B, pca_A, pca_B)

3.1 Scree plots

library(factoextra)
library(hrbrthemes) # for theme_ipsum()

# Loop through each dataframe in list_of_dataframes
for (df_name in names(list_of_dataframes)) {
  # Extract the current dataframe
  g <- list_of_dataframes[[df_name]]

  # Compute correlation matrix and perform PCA
  corr_matrix <- cor(g)
  data.pca <- princomp(corr_matrix)

  # Create the scree plot
  scree_plot <- fviz_eig(data.pca,
                         ncp = 5,
                         addlabels = TRUE, 
                         barfill = global_fill_colour, 
                         barcolor = global_fill_colour, 
                         ggtheme = theme_ipsum()) +
               ylim(0, 90) +
               labs(subtitle = df_name) # Add dynamic subtitle

  # Display or save the plot
  print(scree_plot)
  # To save: ggsave(paste0("ScreePlot_", df_name, ".png"), scree_plot)
}

3.2 Exporting each components weight:

Using the information from the screeplots, we can deduce the weight we should assign to each component which will be useful in calculating a combined DTW score for the pair:

PCA_component_weights <- data.frame(
  groupid = c("group0_jointlead", 
            "group0_leadfollow",
            "group1_jointlead",
            "group1_leadfollow",
            "group12_jointlead",
            "group2_jointlead",
            "group3_jointlead",
            "group3_leadfollow",
            "group4_leadfollow",
            "group6_jointlead",
            "group6_leadfollow",
            "group8_leadfollow"),
  dimension_1_weight = c(0.59,
                         0.633,
                         0.51,
                         0.703,
                         0.754,
                         0.44,
                         0.764,
                         0.517,
                         0.577,
                         0.51,
                         0.625,
                         0.554),
  dimension_2_weight= c(0.202,
                        0.23,
                        0.387,
                        0.167,
                        0.14,
                        0.373,
                        0.178,
                        0.238,
                        0.306,
                        0.19,
                        0.163,
                        0.253)
)
write_csv(PCA_component_weights, "results/PCAcomponentweights.csv")

3.3 Bug fix: PCA sign ambiguity

Next we calculate the area between the components and pick the combination which results in the smallest area: in our case that must be the case where they are aligned. we do this to ensure there is no sign ambiguity when running PCA. We can justify doing this as the goal of the task was to mirror eachother, and it is highly unlikely any participant was doing the opposite of their partner.

# Function to calculate the area between two components, aligning their lengths
calculate_area_between_components <- function(comp1, comp2) {
  # Align lengths
  min_length <- min(nrow(comp1), nrow(comp2))
  comp1_aligned <- comp1[1:min_length, ]
  comp2_aligned <- comp2[1:min_length, ]

  # Calculate area
  sum(abs(comp1_aligned - comp2_aligned))
}

# Function to align a component based on the smallest area between lines
align_component_based_on_area <- function(component_A, component_B) {
  # Align component lengths for area calculation
  original_area <- calculate_area_between_components(component_A, component_B)
  flipped_area <- calculate_area_between_components(component_A, -component_B)

  if (flipped_area < original_area) {
    return(-component_B)
  } else {
    return(component_B)
  }
}

#THERE ARE ISSUERS WITH VERY LARGE VECTORS. IM TRYING TO COMBAT THAT PROBLEM HERE

batch_size <- 100  # Set a batch size that your system can handle
num_batches <- ceiling(length(pca_components_list_A) / batch_size)

for (batch in 1:num_batches) {
  start_index <- (batch - 1) * batch_size + 1
  end_index <- min(batch * batch_size, length(pca_components_list_A))

  for (i in start_index:end_index) {
    if (!is.null(pca_components_list_A[[i]]) && !is.null(pca_components_list_B[[i]])) {
      pca_components_list_B[[i]] <- align_component_based_on_area(pca_components_list_A[[i]], pca_components_list_B[[i]])
    }
  }

  # Optional: Save intermediate results to disk, clear memory, etc.
}


# Clean up
rm(list_of_dataframes_A, list_of_dataframes_B)

4 Visualising

We can now visualise how the principle components behave, and get an idea of how synchronised they are before having formally conducted DTW.

# Loop through each index of the component lists
for (i in seq_along(pca_components_list_A)) {
  # Extract components for Subject A and Subject B
  component_A <- pca_components_list_A[[i]]
  component_B <- pca_components_list_B[[i]]

  # Ensure both components have the same number of rows
  min_rows <- min(nrow(component_A), nrow(component_B))

  # Combine both components into a single dataframe
  combined_df <- data.frame(
    time = rep(1:min_rows, 2),
    value = c(component_A[1:min_rows, 1], component_B[1:min_rows, 1]),
    group = rep(c("Subject A", "Subject B"), each = min_rows)
  )

  # Create a time series plot for both subjects
  component_name_A <- names(pca_components_list_A)[i]
  component_name_B <- names(pca_components_list_B)[i]
  group_number <- paste(component_name_A)

  
  p <- ggplot(combined_df, aes(x = time, y = value, color = group)) +
    geom_line(size = 1) +
    labs(
      title = "Principle components", 
      subtitle = group_number,
         x = "Time", y = "Component Value", color = "Subject") +
    scale_color_manual(values = aesthetic_highlight_difference_palette)

  # Print the plot
  print(p)
}

OBS: LOOK AT ORIGINAL FOOTAGE FROM:

  • Group 12_leadfollow

  • Group 13_jointlead

  • Group 13_leadfollow

  • Group2_leadfollow

  • Group 6_jointlead

5 Saving principle components

All components are saved in a folder to later be processed using DTW.

library(fs)

# Create the "results/PCA" directory if it doesn't already exist
dir_create(path("results", "PCA"), recursive = TRUE)

# Function to save each matrix in the component list as a CSV with an optional suffix
save_component_list_as_csv <- function(component_list, dir_path, suffix = "") {
  # Ensure component_list has named indices
  if (is.null(names(component_list))) {
    stop("The component list must have named indices.")
  }

  for (comp_name in names(component_list)) {
    # Use the component name for the file name, appending the suffix
    file_path <- file.path(dir_path, paste0(comp_name, suffix, ".csv"))
    write.csv(component_list[[comp_name]], file_path, row.names = FALSE)
  }

  # Print a message to indicate completion
  cat("All components have been saved as CSV in the '", dir_path, "' directory.\n")
}

# Save pca_components_list_A and pca_components_list_B to "results/PCA" as CSV files
# The files from pca_components_list_A will have an '_A' suffix
save_component_list_as_csv(pca_components_list_A, "results/PCA", "_A")
## All components have been saved as CSV in the ' results/PCA ' directory.
# The files from pca_components_list_B can have no suffix or a different one (e.g., "_B")
save_component_list_as_csv(pca_components_list_B, "results/PCA", "_B")
## All components have been saved as CSV in the ' results/PCA ' directory.